This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
options(java.parameters='-Xmx8g')
library(parallelMap)
parallelStartSocket(4)
## Starting parallelization in mode=socket with cpus=4.
library(data.table)
library(xgboost)
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:xgboost':
##
## slice
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(mlr)
## Loading required package: ParamHelpers
library(rgdal)
## Loading required package: sp
## rgdal: version: 1.3-9, (SVN revision 794)
## Geospatial Data Abstraction Library extensions to R successfully loaded
## Loaded GDAL runtime: GDAL 2.1.3, released 2017/20/01
## Path to GDAL shared files: /Library/Frameworks/R.framework/Versions/3.5/Resources/library/rgdal/gdal
## GDAL binary built with GEOS: FALSE
## Loaded PROJ.4 runtime: Rel. 4.9.3, 15 August 2016, [PJ_VERSION: 493]
## Path to PROJ.4 shared files: /Library/Frameworks/R.framework/Versions/3.5/Resources/library/rgdal/proj
## Linking to sp version: 1.3-1
library(GISTools)
## Loading required package: maptools
## Checking rgeos availability: TRUE
## Loading required package: RColorBrewer
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## Loading required package: rgeos
## rgeos version: 0.4-2, (SVN revision 581)
## GEOS runtime version: 3.6.1-CAPI-1.10.1
## Linking to sp version: 1.3-1
## Polygon checking: TRUE
library(maps)
##
## Attaching package: 'maps'
## The following object is masked from 'package:GISTools':
##
## map.scale
library(ggplot2)
library(ggmap)
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
library(rsample)
## Loading required package: tidyr
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
##
## hour, isoweek, mday, minute, month, quarter, second, wday,
## week, yday, year
## The following object is masked from 'package:base':
##
## date
library(raster)
##
## Attaching package: 'raster'
## The following object is masked from 'package:tidyr':
##
## extract
## The following objects are masked from 'package:MASS':
##
## area, select
## The following object is masked from 'package:mlr':
##
## resample
## The following object is masked from 'package:ParamHelpers':
##
## getValues
## The following object is masked from 'package:dplyr':
##
## select
## The following object is masked from 'package:data.table':
##
## shift
# Read the first 1,000,000 rows of the training file and the whole test file,
# then print mlr's per-column summary of the training table.
a <- fread(
  input = "~/Documents/challenge/new-york-city-taxi-fare-prediction/train.csv",
  nrows = 1000000
)
b <- fread(
  input = "~/Documents/challenge/new-york-city-taxi-fare-prediction/test.csv"
)
summarizeColumns(a)
## name type na mean disp median mad
## 1 key character 0 NA 0.999999 NA NA
## 2 fare_amount numeric 0 11.348079 9.822090 8.50000 4.44780000
## 3 pickup_datetime character 0 NA 0.999991 NA NA
## 4 pickup_longitude numeric 0 -72.526640 12.057937 -73.98179 0.01769928
## 5 pickup_latitude numeric 0 39.929008 7.626154 40.75270 0.02353183
## 6 dropoff_longitude numeric 10 -72.527860 11.324494 -73.98014 0.01911071
## 7 dropoff_latitude numeric 10 39.919954 8.201418 40.75317 0.02478314
## 8 passenger_count integer 0 1.684924 1.323911 1.00000 0.00000000
## min max nlevs
## 1 1.000 1.00000 1000000
## 2 -44.900 500.00000 0
## 3 1.000 9.00000 861755
## 4 -3377.681 2522.27133 0
## 5 -3116.285 2621.62843 0
## 6 -3383.297 45.58162 0
## 7 -3114.339 1651.55343 0
## 8 0.000 208.00000 0
# Deal with NA values in the training data.
# Count the missing values per column. is.na() on the table yields a logical
# matrix, so colSums() counts directly; apply(a, 2, ...) would first coerce
# every column (numbers included) to one character matrix.
colSums(is.na(a))
## key fare_amount pickup_datetime pickup_longitude
## 0 0 0 0
## pickup_latitude dropoff_longitude dropoff_latitude passenger_count
## 0 10 10 0
# Drop every row that contains at least one NA.
a <- na.omit(a)
##########################################################
## google map
# Read the Google Maps API key from the environment instead of embedding it
# in the document: a key committed to source is exposed to every reader.
# NOTE(review): the key that used to be hard-coded here should be revoked.
google_key <- Sys.getenv("GOOGLE_MAPS_API_KEY")
if (!nzchar(google_key)) {
  stop(
    "Set the GOOGLE_MAPS_API_KEY environment variable before knitting.",
    call. = FALSE
  )
}
register_google(key = google_key)
# Static road map centred on lon -74, lat 40.7 at zoom level 10.
newyork_map <- get_googlemap(
  center = c(lon = -74, lat = 40.7),
  zoom = 10,
  maptype = "roadmap"
)
## Source : https://maps.googleapis.com/maps/api/staticmap?center=40.7,-74&zoom=10&size=640x640&scale=2&maptype=roadmap&key=xxx
# Plain base map first, then the same map with the pickup points on top.
ggmap(newyork_map)
# `color` is a fixed setting, so it belongs outside aes(); inside aes() the
# string "red" is treated as a data value, which yields a default palette
# colour and a meaningless legend.
ggmap(newyork_map) +
  geom_point(
    data = a,
    aes(x = pickup_longitude, y = pickup_latitude),
    color = "red",
    alpha = 0.08
  ) +
  labs(title = "location of pickup points")
## Warning: Removed 20291 rows containing missing values (geom_point).
# Dropoff points on the base map. `color` is set outside aes() so the points
# are really drawn in blue; inside aes() the string "blue" is mapped as a
# data value and rendered with the default palette colour plus a legend.
ggmap(newyork_map) +
  geom_point(
    data = a,
    aes(x = dropoff_longitude, y = dropoff_latitude),
    color = "blue",
    alpha = 0.08
  ) +
  labs(title = "location of dropoff points")
## Warning: Removed 20302 rows containing missing values (geom_point).
##########################################################
## spatial data analysis and delete some points in the water
## visualization
# Directory that holds the "Export_Output" shapefile layer; it is also made
# the working directory, as before.
shape_dir <- "~/Documents/challenge/DATApro"
setwd(shape_dir)
ny_map <- readOGR(dsn = shape_dir, layer = "Export_Output")
## OGR data source with driver: ESRI Shapefile
## Source: "/Users/yangwang/Documents/challenge/DATApro", layer: "Export_Output"
## with 10 features
## It has 55 fields
## Integer64 fields read as strings: OBJECTID POPULATION POP2010 WHITE BLACK AMERI_ES ASIAN HAWN_PI HISPANIC OTHER MULT_RACE MALES FEMALES AGE_UNDER5 AGE_5_9 AGE_10_14 AGE_15_19 AGE_20_24 AGE_25_34 AGE_35_44 AGE_45_54 AGE_55_64 AGE_65_74 AGE_75_84 AGE_85_UP HOUSEHOLDS HSEHLD_1_M HSEHLD_1_F MARHH_CHD MARHH_NO_C MHH_CHILD FHH_CHILD FAMILIES HSE_UNITS VACANT OWNER_OCC RENTER_OCC
# Draw the ny_map polygons in grey inside a fixed lon/lat window, add a title
# and a frame, then overlay the given coordinates as small red dots.
draw_trip_points <- function(lon, lat, heading) {
  plot(
    ny_map,
    axes = TRUE,
    col = "grey",
    xlim = c(-74.3, -73.7),
    ylim = c(40, 41.5)
  )
  title(heading)
  box()
  points(lon, lat, col = "red", pch = 20, cex = 0.5)
  invisible(NULL)
}

# One map for the raw pickup coordinates, one for the raw dropoff coordinates.
draw_trip_points(
  a$pickup_longitude, a$pickup_latitude,
  "location of pickup points"
)
draw_trip_points(
  a$dropoff_longitude, a$dropoff_latitude,
  "location of dropoff points"
)
##########################################################
## delete points
# Build SpatialPoints from the pickup coordinates in ny_map's CRS, then
# subset them spatially with ny_map (sp's `[` with a Spatial index).
# NOTE(review): the subset is only plotted here; `a` itself is not filtered
# by this overlay -- confirm whether that is intended.
sp_pickup <- SpatialPoints(
  data.frame(a$pickup_longitude, a$pickup_latitude),
  proj4string = CRS(proj4string(ny_map))
)
spdf_pickup <- sp_pickup[ny_map]
plot(
  ny_map,
  axes = TRUE, col = "grey",
  xlim = c(-74.3, -73.7), ylim = c(40, 41.5)
)
# The title is drawn once; a second title() call only over-prints the first.
title("location of pickup points")
box()
points(spdf_pickup, col = "red", pch = 20, cex = 0.5)
xy <- vector()
## start from here
# Same overlay for the dropoff coordinates.
sp_drop <- SpatialPoints(
  data.frame(a$dropoff_longitude, a$dropoff_latitude),
  proj4string = CRS(proj4string(ny_map))
)
spdf_drop <- sp_drop[ny_map]
plot(
  ny_map,
  axes = TRUE, col = "grey",
  xlim = c(-74.3, -73.7), ylim = c(40, 41.5)
)
# Only the dropoff title is drawn: this map used to receive an extra
# "location of pickup points" title printed over it.
title("location of dropoff points")
box()
points(spdf_drop, col = "red", pch = 20, cex = 0.5)
# Coordinates of the retained pickup points.
# NOTE(review): these come from the pickup overlay even though they follow
# the dropoff section -- kept as in the original; confirm the intent.
x <- spdf_pickup@coords[, 1]
y <- spdf_pickup@coords[, 2]
# Keep only trips whose pickup and dropoff coordinates lie strictly inside
# longitude (-80, -70) and latitude (35, 45). Comma-separated conditions in
# filter() are combined with AND.
a <- filter(
  a,
  pickup_longitude > -80, pickup_longitude < -70,
  pickup_latitude > 35, pickup_latitude < 45,
  dropoff_longitude > -80, dropoff_longitude < -70,
  dropoff_latitude > 35, dropoff_latitude < 45
)
##########################################################
## change time
# Parse the pickup timestamp and derive numeric calendar features from it,
# then drop the two columns that are no longer needed.
a <- a %>%
  mutate(
    pickup_datetime = ymd_hms(pickup_datetime),
    year = as.numeric(year(pickup_datetime)),
    month = as.numeric(month(pickup_datetime)),
    day = as.numeric(day(pickup_datetime)),
    dayOfWeek = as.numeric(wday(pickup_datetime)),
    hour = as.numeric(hour(pickup_datetime)),
    minute = as.numeric(minute(pickup_datetime))
  ) %>%
  # Drop by name rather than by position (columns 1 and 3), so a change in
  # column order cannot silently remove the wrong columns. dplyr:: is spelled
  # out because select() is masked by MASS and raster in this session.
  dplyr::select(-key, -pickup_datetime)
##########################################################
# Example only: query the driving distance for the first 10 trips through
# ggmap::mapdist() (one web request sequence per trip).
# Create the column up front. Assigning a$dis[i] while the column does not
# exist yet makes the first distance get recycled into every row.
a$dis <- NA_real_
for (i in seq_len(min(10, nrow(a)))) {
  dv <- mapdist(
    c(a$pickup_longitude[i], a$pickup_latitude[i]),
    c(a$dropoff_longitude[i], a$dropoff_latitude[i]),
    mode = "driving"
  )
  a$dis[i] <- dv$miles
}
## Source : https://maps.googleapis.com/maps/api/geocode/json?latlng=40.721319,-73.844311&key=xxx
## Multiple addresses found, the first will be returned:
## 107-60 Queens Blvd, Forest Hills, NY 11375, USA
## 107-72 Queens Blvd, Forest Hills, NY 11375, USA
## Forest Hills - 71 Av, Queens, NY 11375, USA
## Queens Blvd, Forest Hills, NY 11375, USA
## Forest Hills, Queens, NY 11375, USA
## Forest Hills, NY 11375, USA
## Queens, NY, USA
## Queens County, Queens, NY, USA
## New York, NY, USA
## Long Island, New York, USA
## New York, USA
## United States
## Source : https://maps.googleapis.com/maps/api/geocode/json?latlng=40.712278,-73.84161&key=xxx
## Multiple addresses found, the first will be returned:
## 111 Puritan Avenue Suite 3d, Forest Hills, NY 11375, Forest Hills, NY 11375, United States
## 1-65 Greenway S, Flushing, NY 11375, USA
## 154 Puritan Ave, Forest Hills, NY 11375, USA
## 75 Puritan Ave, Forest Hills, NY 11375, USA
## 1-144 Puritan Ave, Forest Hills, NY 11375, USA
## Forest Hills, Queens, NY 11375, USA
## Forest Hills, NY 11375, USA
## Queens, NY, USA
## Queens County, Queens, NY, USA
## New York, NY, USA
## Long Island, New York, USA
## New York, USA
## United States
## Source : https://maps.googleapis.com/maps/api/distancematrix/json?origins=107-60+Queens+Blvd,+Forest+Hills,+NY+11375,+USA&destinations=111+Puritan+Avenue+Suite+3d,+Forest+Hills,+NY+11375,+Forest+Hills,+NY+11375,+United+States&key=xxx&mode=driving
## Source : https://maps.googleapis.com/maps/api/geocode/json?latlng=40.711303,-74.016048&key=xxx
## Multiple addresses found, the first will be returned:
## 395 South End Ave, New York, NY 10280, USA
## 389 South End Ave, New York, NY 10280, USA
## 399-375 South End Ave, New York, NY 10280, USA
## Battery Park City, New York, NY, USA
## New York, NY 10280, USA
## Manhattan, New York, NY, USA
## New York County, New York, NY, USA
## New York, NY, USA
## New York, USA
## United States
## Source : https://maps.googleapis.com/maps/api/geocode/json?latlng=40.782004,-73.979268&key=xxx
## Multiple addresses found, the first will be returned:
## 364 Amsterdam Ave, New York, NY 10024, USA
## Amsterdam Av/W 77 St, New York, NY 10024, USA
## 378-358 Amsterdam Ave, New York, NY 10024, USA
## New York, NY 10024, USA
## Upper West Side, New York, NY, USA
## Manhattan, New York, NY, USA
## New York County, New York, NY, USA
## New York, NY, USA
## New York, USA
## United States
## Source : https://maps.googleapis.com/maps/api/distancematrix/json?origins=395+South+End+Ave,+New+York,+NY+10280,+USA&destinations=364+Amsterdam+Ave,+New+York,+NY+10024,+USA&key=xxx&mode=driving
## Source : https://maps.googleapis.com/maps/api/geocode/json?latlng=40.76127,-73.982738&key=xxx
## Multiple addresses found, the first will be returned:
## 150 W 51st St, New York, NY 10019, USA
## 76179 7th Ave, New York, NY 10019, USA
## 761 7th Ave, New York, NY 10019, USA
## 782-770 7th Ave, New York, NY 10019, USA
## New York, NY 10020, USA
## Theater District, New York, NY, USA
## Midtown Manhattan, New York, NY, USA
## Manhattan, New York, NY, USA
## New York County, New York, NY, USA
## New York, NY, USA
## New York, USA
## United States
## Source : https://maps.googleapis.com/maps/api/geocode/json?latlng=40.750562,-73.991242&key=xxx
## Multiple addresses found, the first will be returned:
## 420 7th Ave, New York, NY 10119, USA
## 34 St - Penn Station, New York, NY 10120, USA
## 202 W 34th St, New York, NY 10119, USA
## 208 W 33rd St, New York, NY 10001, USA
## 220-200 W 33rd St, New York, NY 10001, USA
## New York, NY 10119, USA
## Midtown South, New York, NY, USA
## Chelsea, New York, NY, USA
## Manhattan, New York, NY, USA
## New York County, New York, NY, USA
## New York, NY, USA
## New York, USA
## United States
## Source : https://maps.googleapis.com/maps/api/distancematrix/json?origins=150+W+51st+St,+New+York,+NY+10019,+USA&destinations=420+7th+Ave,+New+York,+NY+10119,+USA&key=xxx&mode=driving
## Source : https://maps.googleapis.com/maps/api/geocode/json?latlng=40.733143,-73.98713&key=xxx
## Multiple addresses found, the first will be returned:
## E 14 St/3 Av, New York, NY 10003, USA
## 200 E 14th St, New York, NY 10003, USA
## 123 3rd Ave, New York, NY 10003, USA
## 111 3rd Ave, New York, NY 10003, USA
## 122 3rd Ave, New York, NY 10003, USA
## 200-298 E 14th St, New York, NY 10003, USA
## Ukrainian Village, New York, NY, USA
## New York, NY 10003, USA
## Manhattan, New York, NY, USA
## New York County, New York, NY, USA
## New York, NY, USA
## New York, USA
## United States
## Source : https://maps.googleapis.com/maps/api/geocode/json?latlng=40.758092,-73.991567&key=xxx
## Multiple addresses found, the first will be returned:
## 345w W 42nd St, New York, NY 10036, USA
## 345 W 42nd St, New York, NY 10036, USA
## W 42nd St & Port Authority Terminal, W 42nd St, New York, NY 10036, USA
## 343 W 42nd St, New York, NY 10036, USA
## 348-330 W 42nd St, New York, NY 10036, USA
## New York, NY 10036, USA
## Midtown South, New York, NY, USA
## Midtown Manhattan, New York, NY, USA
## Manhattan, New York, NY, USA
## New York County, New York, NY, USA
## New York, NY, USA
## New York, USA
## United States
## Source : https://maps.googleapis.com/maps/api/distancematrix/json?origins=E+14+St/3+Av,+New+York,+NY+10003,+USA&destinations=345w+W+42nd+St,+New+York,+NY+10036,+USA&key=xxx&mode=driving
## Source : https://maps.googleapis.com/maps/api/geocode/json?latlng=40.768008,-73.968095&key=xxx
## Multiple addresses found, the first will be returned:
## 773 Madison Ave, New York, NY 10065, USA
## Madison Av/E 66 St, New York, NY 10065, USA
## 21 E 66th St, New York, NY 10065, USA
## 773-791 Madison Ave, New York, NY 10065, USA
## New York, NY 10065, USA
## Lenox Hill, New York, NY, USA
## Central Park West Historic District, New York, NY, USA
## Manhattan, New York, NY, USA
## New York County, New York, NY, USA
## New York, NY, USA
## New York, USA
## United States
## Source : https://maps.googleapis.com/maps/api/geocode/json?latlng=40.783762,-73.956655&key=xxx
## Multiple addresses found, the first will be returned:
## 1268 Madison Ave, New York, NY 10128, USA
## 46 E 91st St, New York, NY 10128, USA
## 1263 Madison Ave, New York, NY 10128, USA
## 1272-1258 Madison Ave, New York, NY 10128, USA
## Carnegie Hill, New York, NY, USA
## New York, NY 10128, USA
## Upper East Side, New York, NY, USA
## Manhattan, New York, NY, USA
## New York County, New York, NY, USA
## New York, NY, USA
## New York, USA
## United States
## Source : https://maps.googleapis.com/maps/api/distancematrix/json?origins=773+Madison+Ave,+New+York,+NY+10065,+USA&destinations=1268+Madison+Ave,+New+York,+NY+10128,+USA&key=xxx&mode=driving
## Source : https://maps.googleapis.com/maps/api/geocode/json?latlng=40.73163,-74.000964&key=xxx
## Multiple addresses found, the first will be returned:
## 335 6th Ave, New York, NY 10014, USA
## 333 6th Ave, New York, NY 10014, USA
## West 4 St-Washington Sq Sta, O, NY 10012, United States
## 339 6th Ave, New York, NY 10014, USA
## 341-321 6th Ave, New York, NY 10014, USA
## New York, NY 10012, USA
## West Village, New York, NY, USA
## Manhattan, New York, NY, USA
## New York County, New York, NY, USA
## New York, NY, USA
## New York, USA
## United States
## Source : https://maps.googleapis.com/maps/api/geocode/json?latlng=40.758233,-73.972892&key=xxx
## Multiple addresses found, the first will be returned:
## 361 Park Ave, New York, NY 10154, USA
## Park Ave, E 52nd St, New York, NY 10022, United States
## 345 Park Avenue, 345 Park Ave, New York, NY 10154, USA
## 104 E 52nd St, New York, NY 10022, USA
## 100-134 E 52nd St, New York, NY 10022, USA
## New York, NY 10154, USA
## Midtown East, New York, NY, USA
## Midtown Manhattan, New York, NY, USA
## Manhattan, New York, NY, USA
## New York County, New York, NY, USA
## New York, NY, USA
## New York, USA
## United States
## Source : https://maps.googleapis.com/maps/api/distancematrix/json?origins=335+6th+Ave,+New+York,+NY+10014,+USA&destinations=361+Park+Ave,+New+York,+NY+10154,+USA&key=xxx&mode=driving
## Source : https://maps.googleapis.com/maps/api/geocode/json?latlng=40.751662,-73.980002&key=xxx
## Multiple addresses found, the first will be returned:
## 295 Madison Ave, New York, NY 10017, USA
## Madison Av & East 40 St, New York, NY 10017, USA
## 287 Madison Ave, New York, NY 10017, USA
## 286 Madison Ave, New York, NY 10017, USA
## 298-282 Madison Ave, New York, NY 10017, USA
## Murray Hill, New York, NY, USA
## New York, NY 10017, USA
## Midtown Manhattan, New York, NY, USA
## Manhattan, New York, NY, USA
## New York County, New York, NY, USA
## New York, NY, USA
## New York, USA
## United States
## Source : https://maps.googleapis.com/maps/api/geocode/json?latlng=40.764842,-73.973802&key=xxx
## Multiple addresses found, the first will be returned:
## 5 Avenue Station, 1-5, E 59th St, New York, NY 10022, USA
## 2 Central Park S, New York, NY 10019, USA
## Plaza Food Hall, 1 W 59th St, New York, NY 10019, United States
## 4 W 59th St, New York, NY 10019, USA
## W 59th St, New York, NY 10019, USA
## New York, NY 10019, USA
## Manhattan, New York, NY, USA
## New York County, New York, NY, USA
## New York, NY, USA
## New York, USA
## United States
## Source : https://maps.googleapis.com/maps/api/distancematrix/json?origins=295+Madison+Ave,+New+York,+NY+10017,+USA&destinations=5+Avenue+Station,+1-5,+E+59th+St,+New+York,+NY+10022,+USA&key=xxx&mode=driving
## Source : https://maps.googleapis.com/maps/api/geocode/json?latlng=40.774138,-73.9513&key=xxx
## Multiple addresses found, the first will be returned:
## 1576 1st Avenue, New York, NY 10028, USA
## 1577 1st Avenue, New York, NY 10028, USA
## 1571 1st Avenue, New York, NY 10028, USA
## 1560-1576 1st Avenue, New York, NY 10028, USA
## Yorkville, New York, NY, USA
## New York, NY 10028, USA
## Upper East Side, New York, NY, USA
## Manhattan, New York, NY, USA
## New York County, New York, NY, USA
## New York, NY, USA
## New York, USA
## United States
## Source : https://maps.googleapis.com/maps/api/geocode/json?latlng=40.751048,-73.990095&key=xxx
## Multiple addresses found, the first will be returned:
## 457 7th Ave, New York, NY 10001, USA
## 34 Street Penn Station, New York, NY 10001, USA
## 179 W 34th St, New York, NY 10001, USA
## 198-168 W 34th St, New York, NY 10001, USA
## Garment District, New York, NY, USA
## New York, NY 10001, USA
## Midtown Manhattan, New York, NY, USA
## Manhattan, New York, NY, USA
## New York County, New York, NY, USA
## New York, NY, USA
## New York, USA
## United States
## Source : https://maps.googleapis.com/maps/api/distancematrix/json?origins=1576+1st+Avenue,+New+York,+NY+10028,+USA&destinations=457+7th+Ave,+New+York,+NY+10001,+USA&key=xxx&mode=driving
## Source : https://maps.googleapis.com/maps/api/geocode/json?latlng=40.726713,-74.006462&key=xxx
## Multiple addresses found, the first will be returned:
## 74 Charlton St, New York, NY 10014, USA
## 61 Vandam St, New York, NY 10013, USA
## 76-46 Vandam St, New York, NY 10013, USA
## Hudson Square, New York, NY, USA
## New York, NY 10013, USA
## Manhattan, New York, NY, USA
## New York County, New York, NY, USA
## New York, NY, USA
## New York, USA
## United States
## Source : https://maps.googleapis.com/maps/api/geocode/json?latlng=40.731628,-73.993078&key=xxx
## Multiple addresses found, the first will be returned:
## 55 E 9th St, New York, NY 10003, USA
## 40 E 9th St, New York, NY 10003, USA
## 51 E 9th St, New York, NY 10003, USA
## 99-29 E 9th St, New York, NY 10003, USA
## New York, NY 10003, USA
## Greenwich Village, New York, NY, USA
## Manhattan, New York, NY, USA
## New York County, New York, NY, USA
## New York, NY, USA
## New York, USA
## United States
## Source : https://maps.googleapis.com/maps/api/distancematrix/json?origins=74+Charlton+St,+New+York,+NY+10014,+USA&destinations=55+E+9th+St,+New+York,+NY+10003,+USA&key=xxx&mode=driving
## Source : https://maps.googleapis.com/maps/api/geocode/json?latlng=40.733873,-73.980658&key=xxx
## Multiple addresses found, the first will be returned:
## 1 Ave & E 18 St, New York, NY 10009, United States
## 313 1st Avenue, New York, NY 10003, USA
## 310 1st Avenue, New York, NY 10009, USA
## Stuyvesant Town-Peter Cooper Village, New York, NY, USA
## New York, NY 10009, USA
## Midtown Manhattan, New York, NY, USA
## Manhattan, New York, NY, USA
## New York County, New York, NY, USA
## New York, NY, USA
## New York, USA
## United States
## Source : https://maps.googleapis.com/maps/api/geocode/json?latlng=40.758138,-73.99154&key=xxx
## Multiple addresses found, the first will be returned:
## 345 W 42nd St, New York, NY 10036, USA
## 345w W 42nd St, New York, NY 10036, USA
## W 42nd St & Port Authority Terminal, W 42nd St, New York, NY 10036, USA
## 343 W 42nd St, New York, NY 10036, USA
## 348-330 W 42nd St, New York, NY 10036, USA
## New York, NY 10036, USA
## Midtown South, New York, NY, USA
## Midtown Manhattan, New York, NY, USA
## Manhattan, New York, NY, USA
## New York County, New York, NY, USA
## New York, NY, USA
## New York, USA
## United States
## Source : https://maps.googleapis.com/maps/api/distancematrix/json?origins=1+Ave+&+E+18+St,+New+York,+NY+10009,+United+States&destinations=345+W+42nd+St,+New+York,+NY+10036,+USA&key=xxx&mode=driving
####### example ########
# Load the distances stored in mapdistance.csv (no header row) and use the
# second column as the trip distance; the original note marks it as miles.
APIdist <- read.csv(
  file = "/Users/yangwang/Documents/challenge/mapdistance.csv",
  header = FALSE
)
a$dis <- APIdist[["V2"]]
##########################################################
## distance from airport
# Geocode the three airports as numeric c(lon, lat) vectors.
# The LaGuardia query used to be "LGD, ny", which is not the airport code
# (LGA); the geocoder resolved it to a street address in Brooklyn, so every
# LaGuardia distance was measured from the wrong place. The variable name
# `ldg` is kept so later code that refers to it still works.
ldg <- as.numeric(geocode("LaGuardia Airport, NY"))
jfk <- as.numeric(geocode("JFK, ny"))
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=JFK,+ny&key=xxx
ewr <- as.numeric(geocode("EWR, ny"))
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=EWR,+ny&key=xxx
# Two-column (lon, lat) tables of the trip end points.
start <- data.frame(a$pickup_longitude, a$pickup_latitude)
end <- data.frame(a$dropoff_longitude, a$dropoff_latitude)
# Distance from each airport to every pickup (start_*) and dropoff (end_*)
# point; raster::pointDistance() with lonlat = TRUE reports metres.
a$start_lgddis <- pointDistance(ldg, start, lonlat = TRUE)
a$end_lgddis <- pointDistance(ldg, end, lonlat = TRUE)
a$start_jfkdis <- pointDistance(jfk, start, lonlat = TRUE)
a$end_jfkdis <- pointDistance(jfk, end, lonlat = TRUE)
a$start_ewrdis <- pointDistance(ewr, start, lonlat = TRUE)
a$end_ewrdis <- pointDistance(ewr, end, lonlat = TRUE)
# Turn each airport distance into a 0/1 flag: 1 when the pickup (…s) or
# dropoff (…e) point is within 10000 distance units of the airport.
# NOTE(review): the units are those of the *_dis columns -- metres if they
# come from raster::pointDistance(lonlat = TRUE); confirm.
a <- a %>%
  mutate(
    lgds = ifelse(start_lgddis <= 10000, 1, 0),
    lgde = ifelse(end_lgddis <= 10000, 1, 0),
    jfks = ifelse(start_jfkdis <= 10000, 1, 0),
    jfke = ifelse(end_jfkdis <= 10000, 1, 0),
    ewrs = ifelse(start_ewrdis <= 10000, 1, 0),
    ewre = ifelse(end_ewrdis <= 10000, 1, 0)
  ) %>%
  # Remove the raw distance columns by name instead of by position (14:19),
  # so a change in column order cannot drop the wrong columns. dplyr:: is
  # spelled out because select() is masked by MASS and raster here.
  dplyr::select(
    -start_lgddis, -end_lgddis,
    -start_jfkdis, -end_jfkdis,
    -start_ewrdis, -end_ewrdis
  )
##########################################################
## split a
same <- vector()
k <- 1
# Fix the RNG seed so the 80/20 split below is reproducible.
set.seed(1)
a_split_table <- initial_split(a, prop = 0.8)
a_train <- a_split_table %>% training()
a_test <- a_split_table %>% testing()
##########################################################
# Per-column summary statistics of the training part.
summary(a_train)
## fare_amount pickup_longitude pickup_latitude dropoff_longitude
## Min. :-44.90 Min. :-78.73 Min. :37.24 Min. :-78.73
## 1st Qu.: 6.00 1st Qu.:-73.99 1st Qu.:40.74 1st Qu.:-73.99
## Median : 8.50 Median :-73.98 Median :40.75 Median :-73.98
## Mean : 11.33 Mean :-73.98 Mean :40.75 Mean :-73.97
## 3rd Qu.: 12.50 3rd Qu.:-73.97 3rd Qu.:40.77 3rd Qu.:-73.97
## Max. :500.00 Max. :-70.26 Max. :43.21 Max. :-70.05
## dropoff_latitude passenger_count year month
## Min. :37.24 Min. :0.000 Min. :2009 Min. : 1.000
## 1st Qu.:40.74 1st Qu.:1.000 1st Qu.:2010 1st Qu.: 3.000
## Median :40.75 Median :1.000 Median :2012 Median : 6.000
## Mean :40.75 Mean :1.685 Mean :2012 Mean : 6.271
## 3rd Qu.:40.77 3rd Qu.:2.000 3rd Qu.:2013 3rd Qu.: 9.000
## Max. :44.60 Max. :6.000 Max. :2015 Max. :12.000
## day dayOfWeek hour minute
## Min. : 1.00 Min. :1.000 Min. : 0.00 Min. : 0.00
## 1st Qu.: 8.00 1st Qu.:2.000 1st Qu.: 9.00 1st Qu.:15.00
## Median :16.00 Median :4.000 Median :14.00 Median :30.00
## Mean :15.69 Mean :4.122 Mean :13.51 Mean :29.57
## 3rd Qu.:23.00 3rd Qu.:6.000 3rd Qu.:19.00 3rd Qu.:45.00
## Max. :31.00 Max. :7.000 Max. :23.00 Max. :59.00
## dis lgds lgde jfks
## Min. : 0.0008 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.: 1.1293 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000
## Median : 1.9378 Median :0.0000 Median :0.0000 Median :0.00000
## Mean : 3.0193 Mean :0.1569 Mean :0.1748 Mean :0.01632
## 3rd Qu.: 3.5259 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :381.9786 Max. :1.0000 Max. :1.0000 Max. :1.00000
## jfke ewrs ewre
## Min. :0.00000 Min. :0.0000000 Min. :0.000000
## 1st Qu.:0.00000 1st Qu.:0.0000000 1st Qu.:0.000000
## Median :0.00000 Median :0.0000000 Median :0.000000
## Mean :0.01149 Mean :0.0003037 Mean :0.001665
## 3rd Qu.:0.00000 3rd Qu.:0.0000000 3rd Qu.:0.000000
## Max. :1.00000 Max. :1.0000000 Max. :1.000000
##########################################################
##### visual ######
# Working copy of the training data for the plots that add helper columns.
a_map <- a_train
# Histogram of the fare amount with a centred title. The stray trailing
# comma inside theme() was removed: it passed an empty argument.
ggplot(a_train, aes(fare_amount)) +
  geom_histogram(fill = "yellow", bins = 50) +
  ggtitle("Distribution of Fare Amount") +
  theme(plot.title = element_text(hjust = .5))
# Density of the trip distance, x axis limited to [0, 40].
ggplot(a_train, aes(dis)) +
  geom_density(col = "blue") +
  ggtitle("Density of Training Distance") +
  theme(plot.title = element_text(hjust = .5)) +
  scale_x_continuous(limits = c(0, 40))
## Warning: Removed 306 rows containing non-finite values (stat_density).
# Absolute longitude / latitude difference between pickup and dropoff.
a_map <- mutate(
  a_map,
  abslog = abs(pickup_longitude - dropoff_longitude),
  abslat = abs(pickup_latitude - dropoff_latitude)
)
# Scatter of the two differences, coloured by fare, both axes limited to [0, 1].
ggplot(data = a_map, aes(x = abslat, y = abslog, color = fare_amount)) +
  geom_point(size = 1) +
  scale_x_continuous(limits = c(0, 1)) +
  scale_y_continuous(limits = c(0, 1))
## Warning: Removed 142 rows containing missing values (geom_point).
# Fare against distance, with the axes limited to distance [0, 20] and
# fare [0, 150]; points outside the limits are dropped by the scales.
ggplot(data = a_map, aes(x = dis, y = fare_amount)) +
  geom_point(size = 1, color = "blue") +
  scale_x_continuous(limits = c(0, 20)) +
  scale_y_continuous(limits = c(0, 150)) +
  labs(title = "distance<20 and fare<150")
## Warning: Removed 2013 rows containing missing values (geom_point).
# Density of the distance for trips with dis < 30, x axis limited to [0, 40].
ggplot(filter(a_map, dis < 30), aes(dis)) +
  geom_density(col = "blue") +
  ggtitle("Density of Distance") +
  theme(plot.title = element_text(hjust = .5)) +
  scale_x_continuous(limits = c(0, 40))
# Fare against the pickup hour.
ggplot(data = a_map, aes(x = hour, y = fare_amount)) +
  geom_point(size = 1, color = "red") +
  labs(title = "relationship between hour and fare")